setwd("~/HomeWork/Practice")
devtools::install_github("kassambara/datarium")
Skipping install of 'datarium' from a github remote, the SHA1 (f9f9b3a6) has not changed since last install.
Use `force = TRUE` to force installation
data("marketing", package="datarium")
head(marketing)
data("swiss")
head(swiss)
data("Boston", package="MASS")
head(Boston)
Loading Required R packages
library(tidyverse)
library(caret)
theme_set(theme_bw())
Preparing the data
# Load the data
data("marketing", package = "datarium")
# Inspect th data
sample_n(marketing,3)
Split the data into training and test set
set.seed(123)
training.samples <- marketing$sales %>% createDataPartition(p=0.8, list = FALSE)
train.data <- marketing[training.samples,]
test.data <- marketing[-training.samples,]
summary(train.data)
youtube facebook newspaper sales
Min. : 0.84 Min. : 0.00 Min. : 0.36 Min. : 1.92
1st Qu.: 85.95 1st Qu.:11.61 1st Qu.: 19.11 1st Qu.:12.48
Median :196.08 Median :25.44 Median : 35.16 Median :15.48
Mean :179.24 Mean :27.51 Mean : 39.07 Mean :16.77
3rd Qu.:264.54 3rd Qu.:43.89 3rd Qu.: 55.02 3rd Qu.:20.88
Max. :355.68 Max. :59.52 Max. :136.80 Max. :32.40
dim(train.data)
[1] 162 4
dim(test.data)
[1] 38 4
Computing linear regression in r
model <- lm(sales ~., data = train.data)
summary(model)
Call:
lm(formula = sales ~ ., data = train.data)
Residuals:
Min 1Q Median 3Q Max
-10.4122 -1.1101 0.3475 1.4218 3.4987
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.391883 0.440622 7.698 1.41e-12 ***
youtube 0.045574 0.001592 28.630 < 2e-16 ***
facebook 0.186941 0.009888 18.905 < 2e-16 ***
newspaper 0.001786 0.006773 0.264 0.792
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.119 on 158 degrees of freedom
Multiple R-squared: 0.8902, Adjusted R-squared: 0.8881
F-statistic: 426.9 on 3 and 158 DF, p-value: < 2.2e-16
# Make predictios
predictions <- model %>% predict(test.data)
# Model Performance
# (a) Prediction error, RMSE
RMSE(predictions, test.data$sales)
[1] 1.590096
# (b) R-square
R2(predictions, test.data$sales)
[1] 0.9380644
data("Boston", package = "MASS")
set.seed(123)
head(Boston)
training1.samples <- Boston$medv %>% createDataPartition(p=0.8, list = FALSE)
train1.data <- Boston[training.samples,]
test1.data <- Boston[-training.samples, ]
ggplot(train1.data, aes(lstat, medv)) +
geom_point() +
stat_smooth()
Let’s start with fitting a linear regression model
model.lm <- lm(medv~lstat, data = train1.data)
summary(model.lm)
Call:
lm(formula = medv ~ lstat, data = train1.data)
Residuals:
Min 1Q Median 3Q Max
-7.889 -3.805 -1.656 2.059 19.972
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 33.98512 0.88151 38.55 <2e-16 ***
lstat -0.88925 0.06524 -13.63 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5.548 on 160 degrees of freedom
Multiple R-squared: 0.5373, Adjusted R-squared: 0.5344
F-statistic: 185.8 on 1 and 160 DF, p-value: < 2.2e-16
prediction <- model.lm %>% predict(test1.data)
data.frame(RMSE = RMSE(prediction, test1.data$medv),
R2 = R2(prediction, test1.data$medv))
Let’s visualize the data
ggplot(train1.data, aes(lstat, medv)) +
geom_point() +
stat_smooth(method = lm, formula = y ~ x)
medv = b0 + b1 ∗ lstat + b2 ∗ lstat2
mod2 <- lm(medv ~ lstat + I(lstat*lstat), data = train1.data)
summary(mod2)
Call:
lm(formula = medv ~ lstat + I(lstat * lstat), data = train1.data)
Residuals:
Min 1Q Median 3Q Max
-8.3393 -3.2427 -0.5934 2.0975 16.7311
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 43.432475 1.274260 34.084 < 2e-16 ***
lstat -2.520720 0.189206 -13.323 < 2e-16 ***
I(lstat * lstat) 0.053205 0.005921 8.987 7.09e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.532 on 159 degrees of freedom
Multiple R-squared: 0.6932, Adjusted R-squared: 0.6893
F-statistic: 179.6 on 2 and 159 DF, p-value: < 2.2e-16
prediction2 <- mod2 %>% predict(test1.data)
data.frame(
RMSE = RMSE(prediction2, test1.data$medv),
R2 = R2(prediction2, test1.data$medv)
)
mod22 <- lm(medv ~ poly(lstat,2), data = train1.data)
summary(mod22)
Call:
lm(formula = medv ~ poly(lstat, 2), data = train1.data)
Residuals:
Min 1Q Median 3Q Max
-8.3393 -3.2427 -0.5934 2.0975 16.7311
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 23.5407 0.3561 66.116 < 2e-16 ***
poly(lstat, 2)1 -75.6187 4.5318 -16.686 < 2e-16 ***
poly(lstat, 2)2 40.7253 4.5318 8.987 7.09e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.532 on 159 degrees of freedom
Multiple R-squared: 0.6932, Adjusted R-squared: 0.6893
F-statistic: 179.6 on 2 and 159 DF, p-value: < 2.2e-16
prediction22 <- mod22 %>% predict(test1.data)
data.frame(
RMSE = RMSE(prediction22, test1.data$medv),
R2 = R2(prediction22, test1.data$medv)
)
ggplot(train1.data, aes(lstat, medv)) +
geom_point() +
stat_smooth(method = lm, formula = y ~ poly(x,2))
ggplot(train1.data, aes(lstat, medv)) +
geom_point() +
stat_smooth(method = lm, formula = y ~ (x + I(x*x)))
mod6 <- lm(medv ~ poly(lstat, 6), data = train1.data)
summary(mod6)
Call:
lm(formula = medv ~ poly(lstat, 6), data = train1.data)
Residuals:
Min 1Q Median 3Q Max
-12.4412 -2.3739 -0.7595 1.7344 15.9725
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 23.5407 0.3304 71.245 < 2e-16 ***
poly(lstat, 6)1 -75.6187 4.2055 -17.981 < 2e-16 ***
poly(lstat, 6)2 40.7253 4.2055 9.684 < 2e-16 ***
poly(lstat, 6)3 -16.9822 4.2055 -4.038 8.45e-05 ***
poly(lstat, 6)4 12.1006 4.2055 2.877 0.00458 **
poly(lstat, 6)5 -9.1610 4.2055 -2.178 0.03089 *
poly(lstat, 6)6 -2.2956 4.2055 -0.546 0.58595
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.206 on 155 degrees of freedom
Multiple R-squared: 0.7424, Adjusted R-squared: 0.7324
F-statistic: 74.45 on 6 and 155 DF, p-value: < 2.2e-16
As we can see in above model summary the polynomial term beyond 5 is not significant
Let’s visualize model
ggplot(train1.data, aes(lstat, medv)) +
geom_point() +
stat_smooth(method = lm, formula = y ~ poly(x,6))
mod5 <- lm(medv ~ poly(lstat, 5), data = train1.data)
summary(mod5)
Call:
lm(formula = medv ~ poly(lstat, 5), data = train1.data)
Residuals:
Min 1Q Median 3Q Max
-13.0370 -2.4259 -0.7941 1.6414 16.1361
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 23.5407 0.3297 71.406 < 2e-16 ***
poly(lstat, 5)1 -75.6187 4.1961 -18.021 < 2e-16 ***
poly(lstat, 5)2 40.7253 4.1961 9.706 < 2e-16 ***
poly(lstat, 5)3 -16.9822 4.1961 -4.047 8.14e-05 ***
poly(lstat, 5)4 12.1006 4.1961 2.884 0.00448 **
poly(lstat, 5)5 -9.1610 4.1961 -2.183 0.03051 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.196 on 156 degrees of freedom
Multiple R-squared: 0.7419, Adjusted R-squared: 0.7336
F-statistic: 89.69 on 5 and 156 DF, p-value: < 2.2e-16
ggplot(train1.data, aes(lstat, medv)) +
geom_point() +
stat_smooth(method = lm, formula = y ~ poly(x,5))
mod.log <- lm(medv ~ log(lstat), data = train1.data)
summary(mod.log)
Call:
lm(formula = medv ~ log(lstat), data = train1.data)
Residuals:
Min 1Q Median 3Q Max
-7.610 -2.975 -1.048 1.886 17.093
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 50.2130 1.4077 35.67 <2e-16 ***
log(lstat) -11.5918 0.5928 -19.55 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.43 on 160 degrees of freedom
Multiple R-squared: 0.705, Adjusted R-squared: 0.7032
F-statistic: 382.4 on 1 and 160 DF, p-value: < 2.2e-16
prediction.log <- mod.log %>% predict(test1.data)
data.frame(
RMSE = RMSE(prediction.log, test1.data$medv),
R2 = R2(prediction.log, test1.data$medv)
)
ggplot(data = train1.data, aes(lstat, medv))+
geom_point() +
stat_smooth(method = lm, formula = y~log(x))
library(splines)
knots <- quantile(train1.data$lstat, p = c(0.25, 0.5, 0.75))
model.spline <- lm(medv ~bs(lstat,knots=knots), data = train1.data)
summary(model.spline)
Call:
lm(formula = medv ~ bs(lstat, knots = knots), data = train1.data)
Residuals:
Min 1Q Median 3Q Max
-9.0823 -2.0512 -0.6894 1.8034 15.8245
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 44.032 2.643 16.658 < 2e-16 ***
bs(lstat, knots = knots)1 -1.127 4.160 -0.271 0.787
bs(lstat, knots = knots)2 -22.744 2.831 -8.034 2.23e-13 ***
bs(lstat, knots = knots)3 -19.776 3.102 -6.376 1.98e-09 ***
bs(lstat, knots = knots)4 -33.442 3.570 -9.366 < 2e-16 ***
bs(lstat, knots = knots)5 -27.995 4.901 -5.713 5.53e-08 ***
bs(lstat, knots = knots)6 -29.583 4.500 -6.574 7.08e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.097 on 155 degrees of freedom
Multiple R-squared: 0.7555, Adjusted R-squared: 0.746
F-statistic: 79.82 on 6 and 155 DF, p-value: < 2.2e-16
prediction.spline <- model.spline %>% predict(test1.data)
some 'x' values beyond boundary knots may cause ill-conditioned bases
data.frame(
RMSE = RMSE(prediction.spline, test1.data$medv),
R2 = R2(prediction.spline, test1.data$medv)
)
ggplot(train1.data, aes(lstat, medv)) +
geom_point() +
stat_smooth(method = lm, formula = y~bs(x,df=3))
library(mgcv)
Loading required package: nlme
Attaching package: ‘nlme’
The following object is masked from ‘package:dplyr’:
collapse
This is mgcv 1.8-27. For overview type 'help("mgcv-package")'.
model.additive <- gam(medv ~ s(lstat), data = train1.data)
summary(model.additive)
Family: gaussian
Link function: identity
Formula:
medv ~ s(lstat)
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 23.5407 0.3261 72.18 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(lstat) 5.825 7.01 65.16 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
R-sq.(adj) = 0.739 Deviance explained = 74.9%
GCV = 17.99 Scale est. = 17.232 n = 162
prediction.additive <- model.additive %>% predict(test1.data)
data.frame(
RMSE = RMSE(prediction.additive, test1.data$medv),
R2 = R2(prediction.additive, test1.data$medv)
)
ggplot(train1.data, aes(lstat, medv)) +
geom_point() +
stat_smooth(method = gam, formula = y ~ s(x))
In regression model, the most commonly known evaluation metrics include:
R-squared (R2), which is the proportion of variation in the outcome that is explained by the predictor variables. In multiple regression models, R2 corresponds to the squared correlation between the observed outcome values and the predicted values by the model. The Higer the R-squared, the better the model.
Root Mean Squared Error (RMSE), which measures the average error performed by the model in the predicting the outcome for an observation. Mathematically, the RMSE is the square root of the mean squared error (MSE), which is the average squared difference between the observed actual outcome values and the values predicted by the model. So, MSE = mean((observeds - predicteds)^2) and RMSE = sqrt(MSE). The lower the RMSE, the better the model.
Residual Standard Error (RSE), also known as the model sigma, is a variant of the RMSE adjusted for the number of predictors in the model. The lower the RSE, the better the model. In practice, the difference between RMSE and RSE is very small, particularly for large multivariate data.
Mean Absolute Error (MAE), like the RMSE, the MAE measures the prediction error. Mathematically, it is the average absolute difference between obsered and predicted out-comes, MAE = mean(abs(observeds - predicteds)) . MAE is less sensitive to outliers compared to RMSE.
The problem with the above metrics, is that they are sensible to the inclusion of additional variables in the model, even if those variables don’t have significant contribution in explaining the outcome. Put in other words, including additional variables in the model will always increase the R2 and reduce the RMSE. So, we need a more robust metric to guide the model choice.
Concerning R2, there is an adjusted version, called Adjusted R-squared, which adjusts the R2 for having too many variables in the model.
Additionally, there are four other important metrics - AIC, AICc, BIC and Mallows Cp - tha are commonly used for model evaluation and selection. These are an unbiased estimate of the model prediction error MSE. The lower these metrics, he better the model.
AIC stands for (Akike’s Information Criteria), a metric developeed by the Japanese Statistician, Hirotugu Akaike, 1970. The basic idea of AIC is to penalize the inclusion of additional variables to a model. It adds a penalty that increases the error when including additional terms. The lowwer the AIC, the better the model.
AICc is a version of AIC corrected for small sample sizes.
BIC (or Bayesian information criteria) is a variant of AIC with a strong penalty for including additional variables to the model.
Mallows Cp: Avariant of AIC developed by Colin Mallows.
Generally, most commonly used metrics, for measuring regression model quality and comparing models, are: Adjusted R2, AIC, BIC and Cp
In the following sections, we’ll sho you how to compute these above mentioned metrics.
library(modelr)
library(broom)
Attaching package: ‘broom’
The following object is masked from ‘package:modelr’:
bootstrap
# Load the data
data("swiss")
# Inspect the data
sample_n(swiss,3)
We start by creating two models:
model.swiss.1 <- lm(Fertility~., data = swiss)
summary(model.swiss.1)
Call:
lm(formula = Fertility ~ ., data = swiss)
Residuals:
Min 1Q Median 3Q Max
-15.2743 -5.2617 0.5032 4.1198 15.3213
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 66.91518 10.70604 6.250 1.91e-07 ***
Agriculture -0.17211 0.07030 -2.448 0.01873 *
Examination -0.25801 0.25388 -1.016 0.31546
Education -0.87094 0.18303 -4.758 2.43e-05 ***
Catholic 0.10412 0.03526 2.953 0.00519 **
Infant.Mortality 1.07705 0.38172 2.822 0.00734 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 7.165 on 41 degrees of freedom
Multiple R-squared: 0.7067, Adjusted R-squared: 0.671
F-statistic: 19.76 on 5 and 41 DF, p-value: 5.594e-10
data.frame(
R2 = rsquare(model.swiss.1, data = swiss),
RMSE = rmse(model.swiss.1, data = swiss),
MAE = mae(model.swiss.1, data = swiss),
AIC = AIC(model.swiss.1),
BIC = BIC(model.swiss.1)
)
model.swiss.2 <- lm(Fertility~. - Examination, data = swiss)
summary(model.swiss.2)
Call:
lm(formula = Fertility ~ . - Examination, data = swiss)
Residuals:
Min 1Q Median 3Q Max
-14.6765 -6.0522 0.7514 3.1664 16.1422
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.10131 9.60489 6.466 8.49e-08 ***
Agriculture -0.15462 0.06819 -2.267 0.02857 *
Education -0.98026 0.14814 -6.617 5.14e-08 ***
Catholic 0.12467 0.02889 4.315 9.50e-05 ***
Infant.Mortality 1.07844 0.38187 2.824 0.00722 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 7.168 on 42 degrees of freedom
Multiple R-squared: 0.6993, Adjusted R-squared: 0.6707
F-statistic: 24.42 on 4 and 42 DF, p-value: 1.717e-10
data.frame(
R2 = rsquare(model.swiss.2, data = swiss),
RMSE = rmse(model.swiss.2, data = swiss),
MAE = mae(model.swiss.2, data = swiss),
AIC = AIC(model.swiss.2),
BIC = BIC(model.swiss.2)
)
glance(model.swiss.1)
glance(model.swiss.2)
Manual computation of R2, RMSE and MAE
swiss %>% add_predictions(model.swiss.1) %>% summarise(
R2 = cor(Fertility, pred)^2,
MSE = mean(Fertility - pred)^2,
RMSE = sqrt(MSE),
MAE = mean(abs(Fertility - pred))
)
glance(model.swiss.1) %>% select(adj.r.squared, sigma, AIC, BIC, p.value)
glance(model.swiss.2) %>% select(adj.r.squared, sigma, AIC, BIC, p.value)
From the output above, it can be seen that:
The two models have exactly the samed Adjusted R2 (0.67), meaning that they are equivalent in explaining the outcome, here fertility score. Additionally, they have the same amount of residual standard error (RSE or sigma = 7.17). However, the model 2 is more simple than model 1 because it incorporates less variables. All things equal, the simple model is always better in statistics.
The AIC and the BIC of the model 2 are lower than those of the model1. In model comparison strategies, the model with the lowest AIC and BIC score is preferred.
Finally, the F-statistic p.value of the model 2 is lower than the one of the model 1. This means that the model 2 is statistically more significant compared to model 1, which is consistent to the avove conclusion.
Note that, the RMSE and the RSE are measured in the same scale as the outcome variable. Dividing the RSE by the average value of the outcome variable will give you the prediction error rate, which should be as small as possible:
sigma(model.swiss.1)/mean(swiss$Fertility)
[1] 0.1021544
In our example the average prediction error rate is 10%
Cross-validation refers to a set of methods for measuring the performance of a given predictive model on new test data sets.
The basic idea, behind cross-validation techniques, consists of dividing the data into two sets:
Cross-validation is also known as a resampling method because it involves fitting the same statistical method multiple times using different subsets of the data.
The different cross-validation methods for assessing model performance. We cover the following approches:
After building a model, we are interested in determining the accuracy of this model on predicting the outcome for new unseen observations not used to build the model. Put in other words, we want to estimate the prediction error.
To do so, the basic strategy is to:
Briefly, cross-validation algorithms can be summarized as follow:
The following sections describe the diffferent cross-validation techniques.
The validation set approach consists of randomly splitting the data into two sets: one set is used to train the model and the remaining other set is used to test the model.
The process works as follow:
The example below splits the swiss data set so that 80% is used for training a linear regression model and 20% is used to evaluate the model performance.
# Split the data into training and test set
set.seed(123)
training.samples <- swiss$Fertility %>%
createDataPartition(p=0.8, list = FALSE)
train.swiss.data <- swiss[training.samples, ]
test.swiss.data <- swiss[-training.samples, ]
# Build the model
model.swiss.lm <- lm(Fertility ~ ., data = train.swiss.data)
# Make predictions and compute the R2, RMSE and MAE
predictions.swiss.lm <- model.swiss.lm %>% predict(test.swiss.data)
data.frame(
R2 = R2(predictions.swiss.lm, test.swiss.data$Fertility),
RMSE = RMSE(predictions.swiss.lm, test.swiss.data$Fertility),
MAE = MAE(predictions.swiss.lm, test.swiss.data$Fertility)
)
When comparing two models, the one that produces the lowest test sample RMSE is the preferred model.
The RMSE and the MAE are measured in the same scale as the outcome variable. Dividing the RMSE by the average value of the outcome variable will give you the prediction error rate, which sould be as small as possible:
RMSE(predictions.swiss.lm, test.swiss.data$Fertility)/mean(test.swiss.data$Fertility)
[1] 0.1281514
Note that, the validation set method is only useful when you have a large data set that can be partitioned. A disadvantage is that we build a model on a fraction of the data set only, possibly leaving out some interesting information about data, leading to higher bias. Therefore, the test error rate can be highly variable, depending on which observations are included in the training set and which observation are included in the validation set.
This method works as follows: 1. Leave out one data point and build the model on the rest of the data set 2. Test the model against the data point that is left out at step 1 and record the test error associated with the prediction 3. Repeat the process for all data points 4. Compoute the overall prediction error by taking the average of all these test error estimates recoreded at step3.
Practical example in R using the caret package:
# Define training control
train.control <- trainControl(method = "LOOCV")
# Train the model
model.loocv <- train(Fertility ~., data = swiss, method = "lm", trControl = train.control)
# Summarize the results
print(model.loocv)
Linear Regression
47 samples
5 predictor
No pre-processing
Resampling: Leave-One-Out Cross-Validation
Summary of sample sizes: 46, 46, 46, 46, 46, 46, ...
Resampling results:
RMSE Rsquared MAE
7.738618 0.6128307 6.116021
Tuning parameter 'intercept' was held constant at a value of TRUE
The advantage of the LOOCV method is that we make use all data points reducing potential bias.
However, the process is repeated as many times as there are data points, resulting to a higher exection time when n is extreamely large.
Additionally, we test the model performance against one data, point at each iteration. This might result to higher variation in the prediction error, if some data points are outliers. So, we need a good ratio of testing data points, a solution provided by the k-fold cross-validation method.
The k-fold cross validation method evaluates the model performance on different subset of the training data and then calculate the average prediction error rate. The algorithm is as follow:
K-fold cross-validation (CV) is a robust method for estimating the accuracy of a model. The most obvious advantage of k-fold CV compared to LOOCV is computational. A less obvious but potentially more important advantage of k-fold CV is that it often fives more accurate estimates of the test error rate than does LOOCV
In practice, one typically performs k-fold cross-validation using k = 5 or k = 10, as these values have been shown empirically to yield test error rate estimates that suffer neither from excessively high bias nor from very high variance.
The following example uses 10-fold cross validation to estimate the prediction error. Make sure to set seed for reproducibility.
# Define training control
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
# Train the model
model.cv <- train(Fertility~., data = swiss, method = "lm", trControl = train.control)
# Summarize the results
print(model.cv)
Linear Regression
47 samples
5 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 43, 42, 42, 41, 43, 41, ...
Resampling results:
RMSE Rsquared MAE
7.379432 0.7512317 6.032731
Tuning parameter 'intercept' was held constant at a value of TRUE
The process of splitting the data into k-folds can be repeated a number of times, this is called repeated k-fold cross validation.
The bootstrap method is used to quantify the uncertainty with a given statistical estimator or with a predictive model.
It consists of randomly selecting a sample of n observations from the original data set. This subset, called bootstrap data set is then used to evaluate the model.
This procedure is repeated a large number of times and the standard error of the bootstrap estimate is then calculated. The results provide an indication of the variance of the model performance.
Note that, the sampling is performed with replacement, which means that the same observation can occur more than once in the boostrrap data set.
# Define training control
train.control.bootstrap <- trainControl(method = "boot", number = 100)
model.bootstrap <- train(Fertility ~., data = swiss, method = "lm", trControl = train.control.bootstrap)
# Summarize the results
print(model.bootstrap)
Linear Regression
47 samples
5 predictor
No pre-processing
Resampling: Bootstrapped (100 reps)
Summary of sample sizes: 47, 47, 47, 47, 47, 47, ...
Resampling results:
RMSE Rsquared MAE
8.432357 0.6048287 6.791066
Tuning parameter 'intercept' was held constant at a value of TRUE
The bootstrap approach can be used to quantify the uncertainty (or standard error) associated with any given statistical estimator.
For example, you might want to estimate the accuracy of the linear regression beta coefficients using bootstrap method.
The different steps are as follows:
We start by creating a function that retrns the regression model coefficeients:
model_coef <- function(data, index) {
coef(lm(Fertility~., data = data, subset = index))
}
model_coef(swiss, 1:47)
(Intercept) Agriculture Examination Education Catholic Infant.Mortality
66.9151817 -0.1721140 -0.2580082 -0.8709401 0.1041153 1.0770481
library(boot)
Attaching package: ‘boot’
The following object is masked from ‘package:lattice’:
melanoma
boot(swiss, model_coef, 500)
ORDINARY NONPARAMETRIC BOOTSTRAP
Call:
boot(data = swiss, statistic = model_coef, R = 500)
Bootstrap Statistics :
original bias std. error
t1* 66.9151817 0.572138874 10.84264891
t2* -0.1721140 -0.004654338 0.06456320
t3* -0.2580082 -0.027889510 0.25405701
t4* -0.8709401 0.002319556 0.21736743
t5* 0.1041153 -0.002166432 0.03067023
t6* 1.0770481 0.008581232 0.43205390
In the output above,
original column corresponds to the regression coefficients. The associated standard errors are given in the column std.error .
t1 corresponds to the intercept, t2 conrresponds to *Agriculture** and so on ..
For example, it can be observe that, the standard error (SE) of the regression coefficient associated with Agriculture is 0.06
Note that, the standard errors measure the variability/accuracy of the beta coefficients. It can be used to compute the confidence intervals of the coefficients.
For example, the 95% confidence interval for a given coefficient b is defined as b+/-1.96*SE(b), where:
That is, there is approximately a 95% chance that the interval[-0.306,-0.036] will contain the true value of the coefficient.
Using the standard lm() function gives a slightly different standard errors, because the linear model make some assumptions about the data:
summary(lm(Fertility ~., data = swiss))$coef
Estimate Std. Error t value Pr(>|t|)
(Intercept) 66.9151817 10.70603759 6.250229 1.906051e-07
Agriculture -0.1721140 0.07030392 -2.448142 1.872715e-02
Examination -0.2580082 0.25387820 -1.016268 3.154617e-01
Education -0.8709401 0.18302860 -4.758492 2.430605e-05
Catholic 0.1041153 0.03525785 2.952969 5.190079e-03
Infant.Mortality 1.0770481 0.38171965 2.821568 7.335715e-03
The bootstrap approach does not rely on any of these assumptions made by the linear model and so it is likely giving a more accurate estimate on the coefficients standard errors than tis the summary() function.
library(tidyverse)
library(caret)
library(leaps)
The R function regsubsets() [leaps package] can be used to identify different best models of different sizes.
You need to specify the option nvmax, which represents the maximum number of predictors to incorporate in the mode. For example, if nvmax = 5, the function will return up to the best 5-variables model, that is, it returns the best 1-varable model, the best 2-variables model, .., the best 5-variables models.
In our example, we have only 5 predictor variables in the data. So, we’ll use nvmax = 5
models.regsubsets <- regsubsets(Fertility~., data= swiss, nvmax = 5)
summary(models.regsubsets)
Subset selection object
Call: regsubsets.formula(Fertility ~ ., data = swiss, nvmax = 5)
5 Variables (and intercept)
Forced in Forced out
Agriculture FALSE FALSE
Examination FALSE FALSE
Education FALSE FALSE
Catholic FALSE FALSE
Infant.Mortality FALSE FALSE
1 subsets of each size up to 5
Selection Algorithm: exhaustive
Agriculture Examination Education Catholic Infant.Mortality
1 ( 1 ) " " " " "*" " " " "
2 ( 1 ) " " " " "*" "*" " "
3 ( 1 ) " " " " "*" "*" "*"
4 ( 1 ) "*" " " "*" "*" "*"
5 ( 1 ) "*" "*" "*" "*" "*"
The summary() function returns some metrics - Adjusted R2, Cp and BIC allowing us to identify the best overall model, where best is defined as the model that maximize the adjusted R2 and minimize the prediction errors
The adjusted R2 represents the proportion of variation, in the outcome, that are explained by variation in predictors values. The higher the adjusted R2, the better the model.
The best model, according to each of these metrics, can be extracted as follow:
res.sum <- summary(models.regsubsets)
data.frame(
Adj.R2 = which.max(res.sum$adjr2),
CP = which.min(res.sum$cp),
BIC = which.min(res.sum$bic)
)
NA
There is no single correct solution to model selection, each of these criteria will lead to slightly different models.
Remembert that,
“all models are wrong, some models are useful”
So, we have different “best” models depending on which metrics we consider. We need additonal strategies.
Note also that the adjusted R2, BIC and Cp are calculated on the training data that have been used to fit the model. This means that, the model selection, using these metrics, is possibly subject to overfitting and may not perform as well when applied to new data.
A more rigorous approach is to select a models based on the prediction error computed on a new test data usin k-fold cross-validation techniques
Here, we’ll follow the procedure below:
We start by defining two helper function:
#id: model id
#object: regsubsets object
#data: data used to fit regsubsets
get_model_formula <- function(id, object){
models <- summary(object)$which[id,-1]
#get outcome variable
form <- as.formula(object$call[[2]])
outcome <- all.vars(form)[1]
#Get model predictors
predictors <- names(which(models == TRUE))
predictors <- paste(predictors, collapse = "+")
# Build model formula
as.formula(paste(outcome, "~", predictors))
}
get_model_formula(3,models.regsubsets)
Fertility ~ Education + Catholic + Infant.Mortality
<environment: 0x7f9c38bcc868>
get_cv_error <- function (model.formula, data){
set.seed(1)
train.control <- trainControl(method = "cv", number = 5)
cv <- train(model.formula, data = data, method = "lm", trControl = train.control)
cv$results$RMSE
}
Use the above defined method to compute the prediction error
# Compute cross-validation error
model.ids <- 1:5
cv.errors <- map(model.ids, get_model_formula, models.regsubsets) %>%
map(get_cv_error, data = swiss) %>%
unlist()
cv.errors
[1] 9.422610 8.452344 7.926889 7.678508 7.923860
# Select the model that minimize the CV error
which.min(cv.errors)
[1] 4
It can be seen that the model with 4 variables is the best model. It has the lower prediction error. The regression coefficients of this model can be extracted as follow:
coef(models.regsubsets, 4)
(Intercept) Agriculture Education Catholic Infant.Mortality
62.1013116 -0.1546175 -0.9802638 0.1246664 1.0784422
Forward selection: Which starts with no predictors in the model, Iteratively adds the most contributive predictor, and stops when the improvement is no longer statistically significant.
Backwoard selection(or backward elimination): which starts with all predictors in the model (full model), iteratively removes the least contributive predictors, and stops when you have a model where all predictors are statistically significant.
Stepwise selection (or sequential replacement) which is a combination of forward and backward selections. you start with no predictors, then sequentially add the most contributive predictors (like forward selection). After adding each new variable, remove any variables that no longer provide and improvement in the model fit
library(tidyverse)
library(caret)
library(leaps)
library(MASS)
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
# Fit the full model
full.model <- lm(Fertility ~., data = swiss)
# Stepwise regression model
step.model <- stepAIC(full.model, direction = "both", trace = FALSE)
summary(step.model)
Call:
lm(formula = Fertility ~ Agriculture + Education + Catholic +
Infant.Mortality, data = swiss)
Residuals:
Min 1Q Median 3Q Max
-14.6765 -6.0522 0.7514 3.1664 16.1422
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.10131 9.60489 6.466 8.49e-08 ***
Agriculture -0.15462 0.06819 -2.267 0.02857 *
Education -0.98026 0.14814 -6.617 5.14e-08 ***
Catholic 0.12467 0.02889 4.315 9.50e-05 ***
Infant.Mortality 1.07844 0.38187 2.824 0.00722 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 7.168 on 42 degrees of freedom
Multiple R-squared: 0.6993, Adjusted R-squared: 0.6707
F-statistic: 24.42 on 4 and 42 DF, p-value: 1.717e-10
Ridge regression shrinks the regression coefficients, so that varibles, with minor contribution to the outcome, have their coefficients close to zero.
The shrinkage of the coefficients is achieved by penalizing the regression model with a penalty term called L2-norm, which is the sum of the squared coefficients.
The amount of the penalty can be fine-tuned using a constant called lambda . Selecting a good value for lambda is critical
When lambda = 0, the penalty term has no effect, and ridge regression will produce the calssical least square coefficients. However, as lambda increases to infinite, the impact of the shrinkage penalty grows, and the ridge regression coefficients will get close zero.
Note that, in contrast to the ordinary least square regression, ridge regression is highly affected by the scale of the predictors. Therfore, it is better to standardize (i.e., scale) the predictors before applying the ridge regression, so that all the predictors are on the same scale.
The standardization of a predictor x, can be achieved using the formula x` = x/sd(x), where sd(x) is the stadard deviation of . The consequence of this is that, all standardized predictors will have a standard deviation of one allowing the final fit to not depend on the scale on which the predictors are measured.
One important advantage of the ridge regression, is that it still perfroms will, compared to the ordinary least square method, in a situation where you have a large multivariate data with the number of predictors (p) larger than the number of observations (n).
One disadvantage of the ridge regression is that, it will included all the predictors in the final model, unlike the stepwise regression methods, which will genrally select models that involve a reduced set of variables.
Ridge regression shrinks the coefficients towards zero, but it will not set any of them exactly to zero. The lasso regression is an alternative that overcomes this drawback.
Lasso stands for Least Absolute Shrinkage and Selection Operator. It shrinks the regression coefficients toward zero by penalizing the regression model with a penalty term called L1-norm, which is the sum of the absolute coefficients.
In the case of lasso regression, the penalty has the effect of forcing some of the coefficient estimates, with a minor contribution to the model, to be ecactly equal to zero. This means that, lasso can be also seen as an alternative to the subset selection methods for performing variable selection in order to reduce the complexity of the model.
As in ridge regression, selecting a good value of lambda for the lasso is critical.
One obvious advantage of lasso regression over ridge regression, is that it produces simpler and more interpretable models that incorporate only a reduced set of the predictors. However, neither ridge regression nor the lasso will universally dominate the orther.
Generally, lasso mght perform better in a situation where some of the predictors have large coefficients, and the remaining predictors have very small coefficients.
Ridge regression will perform better when the outcome is a function of many predictors, all with coefficents of roughly equal size.
Cross-validation methods can be used for identifying which of these two techniques is better on a particular data set.
Elastic Net produces a regression model that is penalized with both the L1-norm and L2-norm. The consequence of this is to effectively shrink coefficents (Like in ridge regression) and to set some coefficients to zero (as in LASSO).
library(tidyverse)
library(caret)
library(glmnet)
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
Loading required package: foreach
Attaching package: ‘foreach’
The following objects are masked from ‘package:purrr’:
accumulate, when
Loaded glmnet 2.0-18
Preparing the data
We’ll use the Boston data set [in MASS package], for predicting the median house value(mdev), in Boston Suburbs, based on multiple predictor variables.
# Load the data
data("Boston", package = "MASS")
# Split the data into training and test set
set.seed(123)
boston.samples <- Boston$medv %>%
createDataPartition(p=0.8, list = FALSE)
train.boston.data <- Boston[boston.samples, ]
test.boston.data <- Boston[-boston.samples, ]
You need to create two objects:
x <- model.matrix(medv~., train.boston.data)[,-1]
# Outcome variable
y <- train.boston.data$medv
We’ll use the R function glmnet() fro computing penalized linear regression models.
glmnet(x,y, alpha = 1, lambda = NULL)
Call: glmnet(x = x, y = y, alpha = 1, lambda = NULL)
Df %Dev Lambda
[1,] 0 0.00000 6.908000
[2,] 1 0.09343 6.294000
[3,] 2 0.18670 5.735000
[4,] 2 0.26620 5.225000
[5,] 2 0.33220 4.761000
[6,] 2 0.38700 4.338000
[7,] 2 0.43240 3.953000
[8,] 2 0.47020 3.602000
[9,] 2 0.50150 3.282000
[10,] 3 0.53390 2.990000
[11,] 3 0.56100 2.725000
[12,] 3 0.58350 2.482000
[13,] 3 0.60210 2.262000
[14,] 3 0.61770 2.061000
[15,] 3 0.63050 1.878000
[16,] 3 0.64120 1.711000
[17,] 3 0.65010 1.559000
[18,] 3 0.65740 1.421000
[19,] 3 0.66360 1.294000
[20,] 4 0.66990 1.179000
[21,] 4 0.67560 1.075000
[22,] 4 0.68030 0.979100
[23,] 4 0.68420 0.892200
[24,] 4 0.68750 0.812900
[25,] 5 0.69080 0.740700
[26,] 5 0.69380 0.674900
[27,] 6 0.69730 0.614900
[28,] 7 0.70200 0.560300
[29,] 7 0.70610 0.510500
[30,] 8 0.70960 0.465200
[31,] 8 0.71430 0.423800
[32,] 8 0.71820 0.386200
[33,] 8 0.72140 0.351900
[34,] 10 0.72520 0.320600
[35,] 11 0.72850 0.292100
[36,] 11 0.73120 0.266200
[37,] 11 0.73350 0.242500
[38,] 11 0.73540 0.221000
[39,] 11 0.73690 0.201400
[40,] 11 0.73820 0.183500
[41,] 12 0.73950 0.167200
[42,] 12 0.74220 0.152300
[43,] 12 0.74440 0.138800
[44,] 12 0.74620 0.126500
[45,] 12 0.74760 0.115200
[46,] 12 0.74880 0.105000
[47,] 12 0.74990 0.095660
[48,] 12 0.75070 0.087160
[49,] 12 0.75140 0.079420
[50,] 12 0.75200 0.072370
[51,] 12 0.75250 0.065940
[52,] 12 0.75290 0.060080
[53,] 12 0.75320 0.054740
[54,] 12 0.75350 0.049880
[55,] 12 0.75370 0.045450
[56,] 12 0.75390 0.041410
[57,] 12 0.75410 0.037730
[58,] 12 0.75420 0.034380
[59,] 12 0.75430 0.031330
[60,] 12 0.75440 0.028540
[61,] 12 0.75450 0.026010
[62,] 12 0.75460 0.023700
[63,] 12 0.75460 0.021590
[64,] 12 0.75460 0.019670
[65,] 12 0.75470 0.017930
[66,] 12 0.75470 0.016330
[67,] 12 0.75470 0.014880
[68,] 12 0.75480 0.013560
[69,] 12 0.75480 0.012360
[70,] 12 0.75480 0.011260
[71,] 12 0.75480 0.010260
[72,] 12 0.75480 0.009346
[73,] 12 0.75480 0.008516
[74,] 12 0.75480 0.007760
In penalized regression, you need to specify a constant lambda to adjust the amount of the coefficent shrinkage. The best lambda for your data, can be defined as the lambda that minimize the cross-validation prediction error rate. This can be determined automatically using the function cv.glmnet().
In the following section, we start by computing ridge, lasso and elastic net regresion models. Next, we’ll compare the different models in order to choose the best one for our data The best model is defined as the model that has the lowest prediction error, RMSE
Computing ridge regression
# Find the best lambda using cross-validation
set.seed(123)
cv <- cv.glmnet(x,y, alpha = 0)
# Display the best lambda value
cv$lambda.min
[1] 0.6907672
# Fit the final model on the training data
model.ridge <- glmnet(x,y, alpha = 0, lambda = cv$lambda.min)
# Display regression coefficients
coef(model.ridge)
14 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 29.172942853
crim -0.073961766
zn 0.035006472
indus -0.055702961
chas 2.485565613
nox -11.449222575
rm 3.976849313
age -0.002944308
dis -1.221350102
rad 0.147920742
tax -0.006358355
ptratio -0.869860292
black 0.009399874
lstat -0.483051940
# Make predictions on the test data
x.test <- model.matrix(medv~., test.boston.data)[,-1]
predictions.ridge <- model.ridge %>% predict(x.test) %>% as.vector()
# Model performance metrics
data.frame(
RMSE = RMSE(predictions.ridge, test.boston.data$medv),
Rsquare = R2(predictions.ridge, test.boston.data$medv)
)
Note that by default, the function glmnet() standardizes variables so that their scales are comparable. However, the coefficients are always returned on the original scale.
Computing lasso regression
The only difference between the R code used for ridge regression is that for lasso regression you need to specify the argument alpha = 1 instead of alpha - 0
# Find the best lambda using cross-validation
set.seed(123)
cv <- cv.glmnet(x,y,alpha = 1)
# Display the best lambda value
cv$lambda.min
[1] 0.007759554
# Fit the final model on the training data
model <- glmnet(x,y, alpha = 1, lambda = cv$lambda.min)
#Display regression coefficients
coef(model)
14 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) 36.962374270
crim -0.092549948
zn 0.048543171
indus -0.008321076
chas 2.287418592
nox -16.832483690
rm 3.810180182
age .
dis -1.598716155
rad 0.286797839
tax -0.012456750
ptratio -0.950997301
black 0.009652895
lstat -0.528739166
# Make predictions on the test data
x.laso.test <- model.matrix(medv ~., test.boston.data)[,-1]
predictions.lasso <- model %>% predict(x.test) %>% as.vector()
# Model performance metrics
data.frame(
RMSE = RMSE(predictions.lasso, test.boston.data$medv),
Rsquare = R2(predictions.lasso, test.boston.data$medv)
)
The elastic net regression can be easily computed using the caret workflow, which invokes the glmnet package.
We use caret to automatically select the best tuning parameters alpha and lambda. the caret packages tests a range of possible alpha and lambda values, then selects the best values fro lambda and alpha, resulting to a final modl that is an elastic net modle.
Here, we’ll test the combination of 10 different values for alpha and lambda. This is specified using the option tuneLength.
The best alpha and lambda values are those values that minimize the cross-validation error
# # Build the model using the training set
# set.seed(123)
# model.elastic <- train(
# medv ~., data = train.boston.data, method = "glmnet",
# trcontrol = trainControl("cv", number = 5),
# tuneLength = 10
# )
#
# # Best tuning parameter
# model.elastic$bestTune
The principal component regression(PCR) first applies Principal Component Analysis on the data set to summarize the original predictor variables into few new variables also known as principal component (PCs), which are a linear combination of the original data.
These PCs are then used to build the linear regression model. The number of principal components, to incorporate in the model, is chosen by corss-validation (cv). Note that, PCR is suitable when the data set contains highly correlated predictors.
A possible drawback of PCR is that we have no guarantee that the selected principal components are associated with the outcome. Here, the selection of the principal components to incorporate in the model is not supervised by the outcome variable.
An alternative to PCR is the Partial Least Squares (PLS) regression, which identifies new principal components that not only summarizes the original predictors, but also that are related to the outcome. These components are then used to fit the regression model. So compared PCR, PLS uses a dimension reduction strategy that is supervised by the outcome.
Like PCR, PLS is convenient for data with highly-correlated predictors. The number of PCs used in PLS is generally chosen by cross-validation. Predictors and the outcome variables should be generally standardized, to make the variables comparable.
library(tidyverse)
library(caret)
library(pls)
Attaching package: ‘pls’
The following object is masked from ‘package:caret’:
R2
The following object is masked from ‘package:stats’:
loadings
We’ll use the boston data set
# Build the model on training set
set.seed(123)
model.pc <- train(
medv~., data = train.boston.data, method = "pcr",
scale = TRUE,
trControl = trainControl("cv", number = 10),
tuneLength = 10
)
# Plot model RMSE vs different values of components
plot(model.pc)
#Print the best tunning parameter ncomp that
# minimize the cross-validation error, RMSE
model.pc$bestTune
summary(model.pc$finalModel)
Data: X dimension: 407 13
Y dimension: 407 1
Fit method: svdpc
Number of components considered: 5
TRAINING: % variance explained
1 comps 2 comps 3 comps 4 comps 5 comps
X 47.48 58.40 68.00 74.75 80.94
.outcome 38.10 51.02 64.43 65.24 71.17
# Make predictions
predictions.pc <- model.pc %>% predict(test.boston.data)
# Model performance metrics
data.frame(
RMSE = caret::RMSE(predictions.pc, test.boston.data$medv),
Rsquare = caret::R2(predictions.pc, test.boston.data$medv)
)
The plot shows the prediction error made by the model acccording to the number of principal components incorporated in the model.
Our analysis shows that, choosing five principal components (ncomp = 5) gives the smallest prediction error RMSE.
The summary() function also provides the percentage of variance explained in the predictors(x) and in the outcome (medv) using different numbers of components.
For example, 80.94% of the variation (or information) contained in the predictors are captured by 5 components (ncomp = 5). Additionally, setting ncomp = 5, captures 71% of the information in the outcome variable (medv), which is good
Taken together, cross-validation identifies ncomp - 5 as the optimal number of PCs that minimize the prediction error (RMSE) and explains enough variation in the predictors and in the outcome
# Build the model on training set
set.seed(123)
model.pls <- train(
medv~., data = train.boston.data, method = "pls",
scale = TRUE,
trControl = trainControl("cv", number = 10),
tuneLength = 10
)
# Plot model RMSE vs Different values of components
plot(model.pls)
model.pls$bestTune
summary(model.pls$finalModel)
Data: X dimension: 407 13
Y dimension: 407 1
Fit method: oscorespls
Number of components considered: 9
TRAINING: % variance explained
1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps 9 comps
X 46.19 57.32 64.15 69.76 75.63 78.66 82.85 85.92 90.36
.outcome 50.90 71.84 73.71 74.71 75.18 75.35 75.42 75.48 75.49
#Make predictions
predictions.pls <- model.pls %>% predict(test.boston.data)
# Model performance metrics
data.frame(
RMSE = caret::RMSE(predictions.pls, test.boston.data$medv),
Rsquare = caret::R2(predictions.pls, test.boston.data$medv)
)
The optimal number of principal components included in the PLS model is 9. This captures 90% of the variation in the predictors and 75% of the variation in the outcome variable (medv).
In our example, the cross-validation error RMSE obtained with the PLS model is lower than the RMSE obtained using the PCR method. So, the PLS model is the best model, for explaining our data, compaed to the PCR model.
In this part, we’ll comver the follwing topics:
Most of the classification algorithms computes the probability of belonging to a given class.
Observations are then assigned to the class that have the highest probability score.
Generaly, you need to decide a probability cutoff above which you consider the an observation as belonging to a given class.
Dataset will be using
The Pima Indian Diabetes data set is available in the mlbench package. It will be used for binary classification.
# Load the data set
data("PimaIndiansDiabetes2", package = "mlbench")
# Inspect the data
head(PimaIndiansDiabetes2,4)
str(PimaIndiansDiabetes2)
'data.frame': 768 obs. of 9 variables:
$ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
$ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
$ pressure: num 72 66 64 66 40 74 50 NA 70 96 ...
$ triceps : num 35 29 NA 23 35 NA 32 NA 45 NA ...
$ insulin : num NA NA NA 94 168 NA 88 NA 543 NA ...
$ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 NA ...
$ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
$ age : num 50 31 32 21 33 30 26 29 53 54 ...
$ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
The data contains 768 individuals (female) and 9 clinical variables for predictin the probability of individuals in being diabete-positive or negative:
| Column Names | Description |
|---|---|
| pregnant | Number of times pregnant |
| glucose | Plasma glucose concentration (glucose tolerance test) |
| pressure | Diastolic blood pressure (mm Hg) |
| triceps | Triceps skin fold thickness (mm) |
| insulin | 2-Hour serum insulin (mu U/ml) |
| mass | Body mass index (weight in kg/(height in m)^2) |
| pedigree | Diabetes pedigree function |
| age | Age (years) |
| diabetes | Class variable (test for diabetes) |
Performing the following steps might improve the accuracy of your model
set.seed(123)
training.Pima.samples <- PimaIndiansDiabetes2$diabetes %>%
createDataPartition(p = 0.8, list = FALSE)
train.pima.data <- PimaIndiansDiabetes2[training.Pima.samples, ]
test.pima.data <- PimaIndiansDiabetes2[-training.Pima.samples, ]
# Fit the model
model.lr.pima <- glm(diabetes~., data = train.pima.data, family = binomial)
# Summarize the model
summary(model.lr.pima)
Call:
glm(formula = diabetes ~ ., family = binomial, data = train.pima.data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6751 -0.6666 -0.3588 0.6581 2.5650
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -10.196902 1.369247 -7.447 9.54e-14 ***
pregnant 0.082032 0.061219 1.340 0.18025
glucose 0.036517 0.006381 5.723 1.05e-08 ***
pressure 0.001333 0.013053 0.102 0.91863
triceps 0.008425 0.018649 0.452 0.65145
insulin -0.001219 0.001441 -0.845 0.39784
mass 0.081434 0.030448 2.675 0.00748 **
pedigree 1.186528 0.470790 2.520 0.01173 *
age 0.030886 0.020337 1.519 0.12884
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 402.38 on 314 degrees of freedom
Residual deviance: 280.83 on 306 degrees of freedom
(300 observations deleted due to missingness)
AIC: 298.83
Number of Fisher Scoring iterations: 5
## Make predictions
probabilities <- model.lr.pima %>% predict(test.pima.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
# predicted.classes
# Model accuracy
mean(predicted.classes == test.pima.data$diabetes)
[1] NA
The simple logistic regression is used to predict the probability of class membership based on one single predictor variable.
The following R code builds a model to predict the probability of being diabetes-positive based on the plasma glucose concentration:
model.logit.simple <- glm(diabetes ~ glucose, data = train.pima.data, family = binomial)
summary(model.logit.simple)
Call:
glm(formula = diabetes ~ glucose, family = binomial, data = train.pima.data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.1322 -0.7882 -0.5157 0.8641 2.2706
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.555585 0.482107 -11.52 <2e-16 ***
glucose 0.039188 0.003697 10.60 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 790.12 on 610 degrees of freedom
Residual deviance: 637.56 on 609 degrees of freedom
(4 observations deleted due to missingness)
AIC: 641.56
Number of Fisher Scoring iterations: 4
The output above shows the estimate of the regression beta coefficients and their significance levels. The intercept (b0) is -5.55 and the coefficient of glucose variable is 0.039.
The logistic equation can be written as p = exp(-5.55 + 0.039*glucose)/[1+exp(-5.55+0.039*glucose)]. Using this formula, for each new glucose plasma concentration value, you can predict the probability of the individuals in bein diabetes positive.
Predictions can be easily made using the function predict(). Use the option type = “response” to directly obtain the probabilities
newdata <- data.frame(glucose = c(20,190))
probabilities <- model.logit.simple %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes
1 2
"neg" "pos"
train.pima.data %>%
mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>%
ggplot(aes(glucose,prob)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "glm", method.args = list(family = "binomial"))+
labs(
title = "Simple Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabetes-pos"
)
plot(model.logit.simple)
Stepwise logistic regression consists of automatically selecting a reduced number of predictor variables for building the best performing logistic regression model.
Data set: PimaIndiansDiabetes2
Quick start R code
library(MASS)
# Fit the model
removed.missing.data <- na.omit(train.pima.data)
model <- glm(diabetes ~ ., data = removed.missing.data , family = binomial) %>% stepAIC(trace = FALSE)
summary(model)
Call:
glm(formula = diabetes ~ glucose + mass + pedigree + age, family = binomial,
data = removed.missing.data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.8024 -0.6631 -0.3716 0.6617 2.5631
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -10.081719 1.202909 -8.381 < 2e-16 ***
glucose 0.033770 0.005536 6.101 1.06e-09 ***
mass 0.083808 0.022726 3.688 0.000226 ***
pedigree 1.110791 0.456960 2.431 0.015064 *
age 0.051063 0.014602 3.497 0.000471 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 402.38 on 314 degrees of freedom
Residual deviance: 283.63 on 310 degrees of freedom
AIC: 293.63
Number of Fisher Scoring iterations: 5
removed.missing.data.from.test <- na.omit(test.pima.data)
probabilities.step <- model %>% predict(removed.missing.data.from.test, type = "response")
predicted.classes.step.logit <- ifelse(probabilities.step > 0.5, "pos", "neg")
#Model Accuracy
mean(predicted.classes.step.logit == removed.missing.data.from.test$diabetes)
[1] 0.7922078
df <- data.frame("Predicted" = predicted.classes.step.logit)
(df$Predicted == test.pima.data$diabetes)
longer object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object length
[1] TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE
[22] TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
[43] FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
[64] TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
[85] FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
[106] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
[127] TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
[148] FALSE TRUE FALSE TRUE FALSE TRUE
Penalized logistic regression imposes a penalty to the logistic model for having too many variables. This results in shrinking the coefficients of the less contributive variables toward zero. This is also know as regularization. The most commonly used penalized regression include
Ridge regression: variables with minor contribution have their coefficients close to zero. However, all the variables are incorporated in the model. This is useful when all variables need to be incorporated in the model according to domain knowledge.
lasso regression: the coefficients of some less contributive variables aer forced to be exactly zero. Only the most significant variables ae kept in the final model.
elastic net regression: the combination of ridge and lasso regression. It shrinks some coefficients toward zero (like ridge regerssion) and set some coefficients to exactly zero (like lasso regression)
Required packages
library(tidyverse)
library(caret)
library(glmnet)
Preparing the data
Data set: PimaIndiansDiabetes2
# Load the data and remove NAs
Pima.Indians.data <- na.omit(PimaIndiansDiabetes2)
# Inspect the data
sample_n(Pima.Indians.data, 3)
# Split the data into training and test set
set.seed(123)
training.samples <- Pima.Indians.data$diabetes %>%
createDataPartition(p = 0.8, list = FALSE)
train.pima.indians <- Pima.Indians.data[training.samples, ]
test.pima.indians <- Pima.Indians.data[-training.samples, ]
Additional data preparation
The R function model.matrix() help to create the matrix of predictors and also automatically converts categorical predictors to appropriate dummy variables, which is required for the glmnet() function.
# Dummy code categorical predictor variables
x <- model.matrix(diabetes ~., train.pima.indians)[,-1]
# Convert the outcome (class) to a numerical variable
y <- ifelse(train.pima.indians$diabetes == "pos", 1, 0)
We’ll use the R function glmnet() for computing penalized logistic regression.
The simplified format is as follow:
library(glmnet)
glmnet(x, y, family = "binomial", alpha = 1, lambda = NULL)
Call: glmnet(x = x, y = y, family = "binomial", alpha = 1, lambda = NULL)
Df %Dev Lambda
[1,] 0 3.147e-15 0.2480000
[2,] 1 3.690e-02 0.2259000
[3,] 1 6.743e-02 0.2059000
[4,] 1 9.292e-02 0.1876000
[5,] 1 1.144e-01 0.1709000
[6,] 1 1.325e-01 0.1557000
[7,] 1 1.479e-01 0.1419000
[8,] 1 1.610e-01 0.1293000
[9,] 1 1.722e-01 0.1178000
[10,] 2 1.851e-01 0.1073000
[11,] 2 1.970e-01 0.0978000
[12,] 2 2.071e-01 0.0891100
[13,] 2 2.158e-01 0.0812000
[14,] 2 2.233e-01 0.0739800
[15,] 3 2.314e-01 0.0674100
[16,] 5 2.400e-01 0.0614200
[17,] 5 2.484e-01 0.0559600
[18,] 5 2.558e-01 0.0509900
[19,] 5 2.620e-01 0.0464600
[20,] 5 2.674e-01 0.0423400
[21,] 5 2.721e-01 0.0385700
[22,] 6 2.762e-01 0.0351500
[23,] 6 2.798e-01 0.0320300
[24,] 6 2.829e-01 0.0291800
[25,] 6 2.856e-01 0.0265900
[26,] 6 2.879e-01 0.0242300
[27,] 6 2.898e-01 0.0220700
[28,] 6 2.915e-01 0.0201100
[29,] 6 2.929e-01 0.0183300
[30,] 6 2.941e-01 0.0167000
[31,] 6 2.951e-01 0.0152100
[32,] 6 2.959e-01 0.0138600
[33,] 6 2.967e-01 0.0126300
[34,] 7 2.975e-01 0.0115100
[35,] 7 2.984e-01 0.0104900
[36,] 7 2.993e-01 0.0095550
[37,] 7 2.999e-01 0.0087060
[38,] 7 3.005e-01 0.0079330
[39,] 7 3.010e-01 0.0072280
[40,] 7 3.014e-01 0.0065860
[41,] 7 3.018e-01 0.0060010
[42,] 8 3.022e-01 0.0054680
[43,] 8 3.025e-01 0.0049820
[44,] 8 3.028e-01 0.0045390
[45,] 8 3.030e-01 0.0041360
[46,] 8 3.032e-01 0.0037690
[47,] 8 3.034e-01 0.0034340
[48,] 8 3.035e-01 0.0031290
[49,] 8 3.037e-01 0.0028510
[50,] 8 3.038e-01 0.0025980
[51,] 8 3.038e-01 0.0023670
[52,] 8 3.039e-01 0.0021570
[53,] 8 3.040e-01 0.0019650
[54,] 8 3.040e-01 0.0017900
[55,] 8 3.040e-01 0.0016310
[56,] 8 3.041e-01 0.0014860
[57,] 8 3.041e-01 0.0013540
[58,] 8 3.041e-01 0.0012340
[59,] 8 3.042e-01 0.0011240
[60,] 8 3.042e-01 0.0010250
[61,] 8 3.042e-01 0.0009336
[62,] 8 3.042e-01 0.0008506
[63,] 8 3.042e-01 0.0007750
set.seed(123)
cv.lasso <- cv.glmnet(x,y, alpha = 1, family = "binomial")
# cv.lasso
# summary(cv.lasso)
# Fit the final model on the training data
model.logit.lasso <- glmnet(x,y, alpha = 1, family = "binomial", lambda = cv.lasso$lambda.min)
# Display regression coefficients
coef(model.logit.lasso)
9 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) -8.6158632778
pregnant 0.0350263984
glucose 0.0369158677
pressure .
triceps 0.0164757684
insulin -0.0003928031
mass 0.0304940618
pedigree 0.7854910473
age 0.0362782651
# Make predictions on the test data
x.test <- model.matrix(diabetes~., test.pima.indians)[,-1]
probabilities.logit.lasso <- model.logit.lasso %>% predict(newx = x.test)
predict.classess.logit.lasso <- ifelse(probabilities.logit.lasso > 0.5, "pos", "neg")
#Model accuracy
observed.classes <- test.pima.indians$diabetes
mean(predict.classess.logit.lasso == observed.classes)
[1] 0.7692308
plot(cv.lasso)
The plot displays the cross-validation error according to the log of lambda. The left dashed vertical line indicates that the log of the optimal value of lambda is approximately -5, which is the one that minimizes the prediction error. This lambda value will give the most accurate model. The exact value of lambda can be viewed as follow:
cv.lasso$lambda.min
[1] 0.008706319
cv.lasso$lambda.1se
[1] 0.06740987
Using lambda.min as the best lambda, gives the following regression coefficients:
coef(cv.lasso, cv.lasso$lambda.min)
9 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) -8.6156146833
pregnant 0.0350758913
glucose 0.0369156360
pressure .
triceps 0.0164842605
insulin -0.0003924262
mass 0.0304848446
pedigree 0.7855063693
age 0.0362650459
coef(cv.lasso, cv.lasso$lambda.1se)
9 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) -4.657500725
pregnant .
glucose 0.026284471
pressure .
triceps 0.001905497
insulin .
mass .
pedigree .
age 0.017338402
The logistic regression method assumes that:
The outcome is a binary or dichotomous variable like yes vs no, positive vs negative, 1 vs 0.
there is alineare relationship between the logit of the outcome and each predictor variables. Recall that the outcome and each predictor variables. Recall that the logit function is logit(p) = log(p/(1-p)), where p is the probabilities of the outcome
There is no influential value (extreame values or outliers) in the continuous predictors
There is no high intercorrelation (i.e. multicollinearity) among the predictors.
Loading reequired R packages
library(tidyverse)
library(broom)
theme_set(theme_classic())
PimaIndiansDiabetes2.cleaned <- na.omit(PimaIndiansDiabetes2)
# Fit the logistic regression model
model.pima.logit2 <- glm(diabetes~., data = PimaIndiansDiabetes2.cleaned, family = binomial)
# Predict the probability (p) of diiabete positivity
probabilities.pima.logit <- predict(model.pima.logit2, type = "response")
predicted.pima.classes <- ifelse(probabilities.pima.logit > 0.5, "pos", "neg")
head(predicted.pima.classes)
4 5 7 9 14 15
"neg" "pos" "neg" "pos" "pos" "pos"
# train.pima.indians
mydata <- PimaIndiansDiabetes2.cleaned %>%
dplyr::select_if(is.numeric)
predictors <- colnames(mydata)
# Bind the logit and tidying the data for plot
model.pima.logit <- mydata %>%
mutate(logit = log(probabilities.pima.logit/(1-probabilities.pima.logit))) %>%
gather(key = "predictors", value = "predictor.value", -logit)
ggplot(model.pima.logit, aes(logit, predictor.value)) +
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "loess") +
theme_bw() +
facet_wrap(~predictors, scales = "free_y")
Influential values are extreme individual data points that can alter the quaity of the logistic regression model. The most extreme values in the data can be examined by visualizing the Cook’s distance values.
Here we label the top 3 largest values:
plot(model.pima.logit2, which = 4, id.n = 3)
Note that, not all outliers are influential observations. To check kwhether the data contains potential influential observations, the standardized residual error can be inspected. Data points with an absolute standardized residuals above 3 represent possible outliers and may deserve closer attention.
The following R code computes the standardized residuals (.std.resid) and the Cook’s distance (.cooksd) using the R function augment()[broom package].
# Extract model results
model.pima.logit.data <- augment(model.pima.logit2) %>%
mutate(index = 1:n())
The data for the top 3 largest values, according to the Cook’s distance, can be displayed as follow:
model.pima.logit.data %>% top_n(3, .cooksd)
NA
Plot the standardized residuals:
ggplot(model.pima.logit.data, aes(index, .std.resid)) +
geom_point(aes(color = diabetes), alpha = 0.5) +
theme_bw()
Filter potential influential data points with abs(.std.res) > 3
model.pima.logit.data %>% filter(abs(.std.resid) > 3)
There is no influential observation in our data.
When we have outliers in a continuous predictor, potential solutions include:
Multicollinearity corresponds to a situation where the data contain highly correlated predictor variables.
Multicollinearity is an important issue in regression analysis and should be fixed by removing the concerned variables. It can be assessed using the R function vif()
car::vif(model.pima.logit2)
pregnant glucose pressure triceps insulin mass pedigree age
1.892387 1.378937 1.191287 1.638865 1.384038 1.832416 1.031715 1.974053
As a rule of thumb, a VIF value that exceeds 5 or 10 indicates a problematic amount of collinearity. In our example there is no collinearity: all variables have a value of VIF well below 5.
The multinomial logistic regressin is an extension of the logistic regression for multclass classifcation tasks. It is used when the outcome involves more than two classes.
Loading required R packages
library(tidyverse)
library(caret)
library(nnet)
Attaching package: ‘nnet’
The following object is masked from ‘package:mgcv’:
multinom
We’ll use the iris data set
# Load the data
data("iris")
# Inspect the data
sample_n(iris,3)
# Split the data into training and test set
set.seed(123)
training.samples.iris <- iris$Species %>% createDataPartition(p=0.8, list = FALSE)
train.iris.data <- iris[training.samples.iris,]
test.iris.data <- iris[-training.samples.iris,]
# Fit the model
model <- nnet::multinom(Species ~., data = train.iris.data)
# weights: 18 (10 variable)
initial value 131.833475
iter 10 value 13.707358
iter 20 value 5.548255
iter 30 value 5.196272
iter 40 value 4.985881
iter 50 value 4.978698
iter 60 value 4.970034
iter 70 value 4.968625
iter 80 value 4.967145
iter 90 value 4.967125
iter 100 value 4.967097
final value 4.967097
stopped after 100 iterations
# Summarize the model
summary(model)
Call:
nnet::multinom(formula = Species ~ ., data = train.iris.data)
Coefficients:
(Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
versicolor 17.29267 -5.643165 -8.391525 14.61331 -2.091772
virginica -21.99694 -6.650591 -14.538985 22.01169 14.158731
Std. Errors:
(Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
versicolor 43.11205 119.7240 198.5332 91.66177 59.26249
virginica 43.68199 119.7252 198.5908 91.81816 59.59031
Residual Deviance: 9.934194
AIC: 29.93419
# Make predictions
predicted.species <- model %>% predict(test.iris.data)
head(predicted.species)
[1] setosa setosa setosa setosa setosa setosa
Levels: setosa versicolor virginica
# Model accuracy
mean(predicted.species == test.iris.data$Species)
[1] 0.9666667
The following discriminant analysis methods will be described:
Linear discriminant analysis (LDA): Uses linear combinations of predictors to predict the class of a given observation. Assumes that the predictor variables (p) are normally distributed and the classes have identical variances (for univariate analysis, p = 1) or identical covariance matrices (for multvariate analysis, p > 1).
Quadratic discriminant analysis (QDA): More flexible than LDA. Here, there is no assumption that the covariance matrix of classes is the same
Mixture discriminant analysis (MDA): Each class is assumed to be a Gaussian mixture of subclasses.
Flexible Discriminant Analysis (FDA): Non-linear combination of predictors is used such as splines.
Regularized discriminant analysis (RDA): Regularization (or shrinkage) improves the estimate of the covariance matrices in situation where the number of predictors is larger than the number of samples in the training data. This leads to an improvement of the discriminant analysis.
library(tidyverse)
library(caret)
theme_set(theme_classic())
# Lodd the data
data("iris")
# Split the data into training and test set
set.seed(123)
training.samples.iris <- iris$Species %>%
createDataPartition(p=0.8, list = FALSE)
train.data.iris <- iris[training.samples.iris, ]
test.data.iris <- iris[-training.samples.iris, ]
# Estimate preprocessing parameters
preproc.pram <- train.data.iris %>%
preProcess(method = c("center", "scale"))
# Transform the data using the estimated parameters
train.transformed <- preproc.pram %>% predict(train.data.iris)
test.transformed <- preproc.pram %>% predict(test.data.iris)
Before performing LDA, consider: * Inspecting the univariate distribution of each variable and make sure that they are normally distribute. If not, you can transform them using log and root for exponential distributions and Box-Cox for skewed distributions.
*removing outliers from your data and standardize the variables to make their scale comparable.
library(MASS)
# Fit the model
model.lda <- lda(Species ~., data = train.transformed)
# Make predictions
predictions <- model.lda %>% predict(test.transformed)
# Model accuracy
mean(predictions$class == test.transformed$Species)
[1] 1
model.lda
Call:
lda(Species ~ ., data = train.transformed)
Prior probabilities of groups:
setosa versicolor virginica
0.3333333 0.3333333 0.3333333
Group means:
Sepal.Length Sepal.Width Petal.Length Petal.Width
setosa -1.0120728 0.7867793 -1.2927218 -1.2496079
versicolor 0.1174121 -0.6478157 0.2724253 0.1541511
virginica 0.8946607 -0.1389636 1.0202965 1.0954568
Coefficients of linear discriminants:
LD1 LD2
Sepal.Length 0.9108023 0.03183011
Sepal.Width 0.6477657 0.89852536
Petal.Length -4.0816032 -2.22724052
Petal.Width -2.3128276 2.65441936
Proportion of trace:
LD1 LD2
0.9905 0.0095
summary(model.lda)
Length Class Mode
prior 3 -none- numeric
counts 3 -none- numeric
means 12 -none- numeric
scaling 8 -none- numeric
lev 3 -none- character
svd 2 -none- numeric
N 1 -none- numeric
call 3 -none- call
terms 3 terms call
xlevels 0 -none- list
LDA determines group means and computes, for each individual, the probability of belonging to the different groups. The individual is then affected to the group with the highest probability score.
The lda() outputs contain the following elements: * prior probabilities of groups: the proportion of training observation in each group. For example, there are 33% of the training observations in the setosa group * Group means: group center of gravity. Shows the mean of each variable in each group. * Coefficients of linear discriminants: Shows the linear combination of predictor variables that are used to form the LDA decisionrule. For example
LD1 = 0.01*Sepal.Length + 0.64*Sepal.Width - 4.08*Petal.Length = 2.3*Petal.Width LD2 = 0.03*Sepal.Length + 0.89*Sepal.Width - 2.2*Petal.Length - 2.6*Petal.Width
Using the function Plot() produces plots of the linear discriminants, obtained by computing LD1 and LD2 for each of the training observations
plot(model.lda)
lda.data <- cbind(train.transformed, predict(model.lda)$x)
ggplot(lda.data, aes(LD1, LD2)) +
geom_point(aes(color = Species))
It can be seen that, our model correctly classified 100% of observations, which is excellent.
Note that, by default, the probability cutoff used to decide group-membership is 0.5
In some situation, you might want to increase the precision of the model. In this case you can fine-tune the model by adjusting the posterior probability cutoff. For example, you can increase or lower the cutoff
Note that, if the predictor variables are standardized before computing LDA, the discriminator weights can be used as measures of variable importance for feature selection
QDA is little bit more flexible than LDA, in the sense that it does not assumes the equality of variance/covariance. In other words, for QDA the covariance matrix can be different for each class.
LDA tends to be a better than QDA then you have a small training set.
In contrast, QDA is recommended if the training set is very large, so that the variance of the classifier is not a major issue, or if the assumption of a common covariance matrix for the K classes if clearly untenable
QDA can be computed using the R function qda() [MASS package]
library(MASS)
#Fit the model
model.qda <- qda(Species~., data = train.transformed)
model.qda
Call:
qda(Species ~ ., data = train.transformed)
Prior probabilities of groups:
setosa versicolor virginica
0.3333333 0.3333333 0.3333333
Group means:
Sepal.Length Sepal.Width Petal.Length Petal.Width
setosa -1.0120728 0.7867793 -1.2927218 -1.2496079
versicolor 0.1174121 -0.6478157 0.2724253 0.1541511
virginica 0.8946607 -0.1389636 1.0202965 1.0954568
# Make predictions
predictions.qda <- model.qda %>% predict(test.transformed)
# Model accuracy
mean(predictions.qda$class == test.transformed$Species)
[1] 1
The LDA classifier assumes that each class comes from a single normal (or Gaussian ) distribution.
This is too restrictive.
For MDA, ther are classes, and each class is assumed to be a Gaussian mixture of subclasses, where each data point has a probability of belonging to each class. Equality of covariance matrix, among classes, is still assumed
library(mda)
Loading required package: class
Loaded mda 0.4-10
model.mda <- mda(Species~., data = train.transformed)
model.mda
Call:
mda(formula = Species ~ ., data = train.transformed)
Dimension: 5
Percent Between-Group Variance Explained:
v1 v2 v3 v4 v5
96.77 99.63 99.86 100.00 100.00
Degrees of Freedom (per dimension): 5
Training Misclassification Error: 0.025 ( N = 120 )
Deviance: 14.005
# Make predictions
predicted.classes.mda <- model.mda %>% predict(test.transformed)
# Model accuracy
mean(predicted.classes.mda == test.transformed$Species)
[1] 1
FDA is a flexible extension of LDA that uses non-linear combinations of predictors such as splines. FDA is useful to model multivariate non-normality or non-linear relationships among variables within each group, alowing for a more accurate classification
library(mda)
# Fit the model
model.fda <- fda(Species~., data = train.transformed)
model.fda
Call:
fda(formula = Species ~ ., data = train.transformed)
Dimension: 2
Percent Between-Group Variance Explained:
v1 v2
99.05 100.00
Degrees of Freedom (per dimension): 5
Training Misclassification Error: 0.025 ( N = 120 )
predicted.classes.fda <- model.fda %>% predict(test.transformed)
#Model accuracy
mean(predicted.classes.fda == test.transformed$Species)
[1] 1
RDA builds a classification rule by regularizing the group covariance matrices allowing a more robust model against multicollinearity in the data. This might be very useful for a large multivariate data set containing highly correlated predictors.
Regularized discriminant analysis is a kind of a trade-off between LDA and QDA. Recall that, in LDA we assume equality of covariance matrix for all of the classes. QDA assumes different covariance matrices for all the classes. QDa assumes different covariance matrices for all the classes. Regularized discriminant analysis is an intermediate between LDA and QDA.
RDA shrinks the separte covariances of QDA toward a common covariance as in LDA. This improves the estimate of the covariance matrices in situations where the number of predictors is larger thant the number of samples in the training data, potentially leading to an improvement of the model accuracy.
library(klaR)
# Fit the model
model.rda <- rda(Species~., data = train.transformed)
# Make predictions
predictions.rda <- model.rda %>% predict(test.transformed)
# Model accuracy
mean(predictions.rda$class == test.transformed$Species)
[1] 1
The Naive Bayes Classifier is a simple and powerful method that can be used for binary and multiclass classification problems.
Naive Bayes classifier predicts the class membership probability of observations using Bayes theorem, which is based on conditional probability, that is the probability of something to happen, given that something else has already occured.
Will use PimaIndiansDiabetes2 data set
library(klaR)
# Fit the model
model.naive <- NaiveBayes(diabetes ~., data = train.pima.indians)
model.naive
$apriori
grouping
neg pos
0.6687898 0.3312102
$tables
$tables$pregnant
[,1] [,2]
neg 2.890476 2.645282
pos 4.471154 3.968215
$tables$glucose
[,1] [,2]
neg 112.1381 25.23393
pos 147.1635 29.39590
$tables$pressure
[,1] [,2]
neg 69.19048 12.13173
pos 73.54808 13.71264
$tables$triceps
[,1] [,2]
neg 27.83333 10.668822
pos 32.68269 9.539152
$tables$insulin
[,1] [,2]
neg 137.0571 110.5755
pos 214.2115 140.5018
$tables$mass
[,1] [,2]
neg 32.11524 6.905892
pos 35.42500 6.825278
$tables$pedigree
[,1] [,2]
neg 0.4699952 0.3091545
pos 0.6139904 0.3853125
$tables$age
[,1] [,2]
neg 28.6619 9.085715
pos 35.8750 10.248076
$levels
[1] "neg" "pos"
$call
NaiveBayes.default(x = X, grouping = Y)
$x
pregnant glucose pressure triceps insulin mass pedigree age
4 1 89 66 23 94 28.1 0.167 21
5 0 137 40 35 168 43.1 2.288 33
7 3 78 50 32 88 31.0 0.248 26
9 2 197 70 45 543 30.5 0.158 53
14 1 189 60 23 846 30.1 0.398 59
15 5 166 72 19 175 25.8 0.587 51
17 0 118 84 47 230 45.8 0.551 31
19 1 103 30 38 83 43.3 0.183 33
20 1 115 70 30 96 34.6 0.529 32
26 10 125 70 26 115 31.1 0.205 41
33 3 88 58 11 54 24.8 0.267 22
44 9 171 110 24 240 45.4 0.721 54
51 1 103 80 11 82 19.4 0.491 22
52 1 101 50 15 36 24.2 0.526 26
53 5 88 66 21 23 24.4 0.342 30
54 8 176 90 34 300 33.7 0.467 58
55 7 150 66 42 342 34.7 0.718 42
57 7 187 68 39 304 37.7 0.254 41
58 0 100 88 60 110 46.8 0.962 31
64 2 141 58 34 128 25.4 0.699 24
69 1 95 66 13 38 19.6 0.334 25
70 4 146 85 27 100 28.9 0.189 27
71 2 100 66 20 90 32.9 0.867 28
74 4 129 86 20 270 35.1 0.231 23
83 7 83 78 26 71 29.3 0.767 36
86 2 110 74 29 125 32.4 0.698 27
88 2 100 68 25 71 38.5 0.324 26
89 15 136 70 32 110 37.1 0.153 43
92 4 123 80 15 176 32.0 0.443 34
93 7 81 78 40 48 46.7 0.261 42
95 2 142 82 18 64 24.7 0.761 21
96 6 144 72 27 228 33.9 0.255 40
98 1 71 48 18 76 20.4 0.323 22
99 6 93 50 30 64 28.7 0.356 23
100 1 122 90 51 220 49.7 0.325 31
104 1 81 72 18 40 26.6 0.283 24
108 4 144 58 28 140 29.5 0.287 37
109 3 83 58 31 18 34.3 0.336 25
112 8 155 62 26 495 34.0 0.543 46
113 1 89 76 34 37 31.2 0.192 23
115 7 160 54 32 175 30.5 0.588 39
120 4 99 76 15 51 23.2 0.223 21
121 0 162 76 56 100 53.2 0.759 25
123 2 107 74 30 100 33.6 0.404 23
126 1 88 30 42 99 55.0 0.496 26
127 3 120 70 30 135 42.9 0.452 30
128 1 118 58 36 94 33.3 0.261 23
129 1 117 88 24 145 34.5 0.403 40
131 4 173 70 14 168 29.7 0.361 33
133 3 170 64 37 225 34.5 0.356 30
135 2 96 68 13 49 21.1 0.647 26
136 2 125 60 20 140 33.8 0.088 31
140 5 105 72 29 325 36.9 0.159 28
143 2 108 52 26 63 32.5 0.318 22
145 4 154 62 31 284 32.8 0.237 23
148 2 106 64 35 119 30.5 1.400 34
151 1 136 74 50 204 37.4 0.399 24
153 9 156 86 28 155 34.3 1.189 42
154 1 153 82 42 485 40.6 0.687 23
158 1 109 56 21 135 25.2 0.833 23
159 2 88 74 19 53 29.0 0.229 22
160 17 163 72 41 114 40.9 0.817 47
162 7 102 74 40 105 37.2 0.204 45
163 0 114 80 34 285 44.2 0.167 27
172 6 134 70 23 130 35.4 0.542 29
174 1 79 60 42 48 43.5 0.678 23
175 2 75 64 24 55 29.7 0.370 33
178 0 129 110 46 130 67.1 0.319 26
182 0 119 64 18 92 34.9 0.725 23
187 8 181 68 36 495 30.1 0.615 60
188 1 128 98 41 58 32.0 1.321 33
189 8 109 76 39 114 27.9 0.640 31
190 5 139 80 35 160 31.6 0.361 25
192 9 123 70 44 94 33.1 0.374 40
196 5 158 84 41 210 39.4 0.395 29
198 3 107 62 13 48 22.9 0.678 23
199 4 109 64 44 99 34.8 0.905 26
200 4 148 60 27 318 30.9 0.150 29
204 2 99 70 16 44 20.4 0.235 27
205 6 103 72 32 190 37.7 0.324 55
214 0 140 65 26 130 42.6 0.431 24
215 9 112 82 32 175 34.2 0.260 36
216 12 151 70 40 271 41.8 0.742 38
218 6 125 68 30 120 30.0 0.464 32
221 0 177 60 29 478 34.6 1.072 21
225 1 100 66 15 56 23.6 0.666 26
226 1 87 78 27 32 34.6 0.101 22
229 4 197 70 39 744 36.7 2.329 31
230 0 117 80 31 53 45.2 0.089 24
232 6 134 80 37 370 46.2 0.238 46
233 1 79 80 25 37 25.4 0.583 22
235 3 74 68 28 45 29.7 0.293 23
237 7 181 84 21 192 35.9 0.586 51
242 4 91 70 32 88 33.1 0.446 22
244 6 119 50 22 176 27.1 1.318 33
245 2 146 76 35 194 38.2 0.329 29
248 0 165 90 33 680 52.3 0.427 23
249 9 124 70 33 402 35.4 0.282 34
255 12 92 62 7 258 27.6 0.926 44
259 1 193 50 16 375 25.9 0.655 24
260 11 155 76 28 150 33.3 1.353 51
261 3 191 68 15 130 30.9 0.299 34
266 5 96 74 18 67 33.6 0.997 43
272 2 108 62 32 56 25.2 0.128 21
274 1 71 78 50 45 33.2 0.422 21
276 2 100 70 52 57 40.5 0.677 25
280 2 108 62 10 278 25.3 0.881 22
282 10 129 76 28 122 35.9 0.280 39
283 7 133 88 15 155 32.4 0.262 37
286 7 136 74 26 135 26.0 0.647 51
287 5 155 84 44 545 38.7 0.619 34
290 5 108 72 43 75 36.1 0.263 33
291 0 78 88 29 40 36.9 0.434 21
292 0 107 62 30 74 36.6 0.757 25
293 2 128 78 37 182 43.3 1.224 31
294 1 128 48 45 194 40.5 0.613 24
296 6 151 62 31 120 35.5 0.692 28
297 2 146 70 38 360 28.0 0.337 29
298 0 126 84 29 215 30.7 0.520 24
306 2 120 76 37 105 39.7 0.215 29
307 10 161 68 23 132 25.5 0.326 47
308 0 137 68 14 148 24.8 0.143 21
310 2 124 68 28 205 32.9 0.875 30
312 0 106 70 37 148 39.4 0.605 22
313 2 155 74 17 96 26.6 0.433 27
[ reached 'max' / getOption("max.print") -- omitted 189 rows ]
$usekernel
[1] FALSE
$varnames
[1] "pregnant" "glucose" "pressure" "triceps" "insulin" "mass" "pedigree" "age"
attr(,"class")
[1] "NaiveBayes"
# Make predictions
prediction.naive <- model.naive %>% predict(test.pima.indians)
Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 33Numerical 0 probability for all classes with observation 34Numerical 0 probability for all classes with observation 35Numerical 0 probability for all classes with observation 36Numerical 0 probability for all classes with observation 37Numerical 0 probability for all classes with observation 38Numerical 0 probability for all classes with observation 39Numerical 0 probability for all classes with observation 40Numerical 0 probability for all classes with observation 41Numerical 0 probability for all classes with observation 42Numerical 0 probability for all classes with observation 43Numerical 0 probability for all classes with observation 44Numerical 0 probability for all classes with observation 45Numerical 0 probability for all classes with observation 46Numerical 0 probability for all classes with observation 47Numerical 0 probability for all classes with observation 48Numerical 0 probability for all classes with observation 49Numerical 0 probability for all classes with observation 50Numerical 0 probability for all classes with observation 51Numerical 0 probability for all classes with observation 52Numerical 0 probability for all classes with observation 53Numerical 0 probability for all classes with observation 54Numerical 0 probability for all classes with observation 55Numerical 0 probability for all classes with observation 56Numerical 0 probability for all classes with observation 57Numerical 0 probability for all classes with observation 58Numerical 0 probability for all classes with observation 59Numerical 0 probability for all classes with observation 60Numerical 0 probability for all classes with observation 61Numerical 0 probability for all classes with observation 62Numerical 0 probability for all classes with observation 63Numerical 0 probability for all classes with observation 64Numerical 0 probability for all classes with observation 65Numerical 0 probability for all classes with observation 66Numerical 0 probability for all classes with observation 67Numerical 0 probability for all classes with observation 68Numerical 0 probability for all classes with observation 69Numerical 0 probability for all classes with observation 70Numerical 0 probability for all classes with observation 71Numerical 0 probability for all classes with observation 72Numerical 0 probability for all classes with observation 73Numerical 0 probability for all classes with observation 74Numerical 0 probability for all classes with observation 75Numerical 0 probability for all classes with observation 76Numerical 0 probability for all classes with observation 77Numerical 0 probability for all classes with observation 78
# Model accuracy
mean(prediction.naive$class == test.pima.indians$diabetes)
[1] 0.8205128
The caet R package can automatically train the model and assess the modle accuracy using k-fold cross-validation
library(klaR)
set.seed(123)
model.naive2 <- train(diabetes~., data = train.pima.indians, method = "nb", trControl = trainControl("cv", number = 10))
Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32
# make predictions
predicted.classes.naive2 <- model.naive2 %>% predict(test.pima.indians)
Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 33Numerical 0 probability for all classes with observation 34Numerical 0 probability for all classes with observation 35Numerical 0 probability for all classes with observation 36Numerical 0 probability for all classes with observation 37Numerical 0 probability for all classes with observation 38Numerical 0 probability for all classes with observation 39Numerical 0 probability for all classes with observation 40Numerical 0 probability for all classes with observation 41Numerical 0 probability for all classes with observation 42Numerical 0 probability for all classes with observation 43Numerical 0 probability for all classes with observation 44Numerical 0 probability for all classes with observation 45Numerical 0 probability for all classes with observation 46Numerical 0 probability for all classes with observation 47Numerical 0 probability for all classes with observation 48Numerical 0 probability for all classes with observation 49Numerical 0 probability for all classes with observation 50Numerical 0 probability for all classes with observation 51Numerical 0 probability for all classes with observation 52Numerical 0 probability for all classes with observation 53Numerical 0 probability for all classes with observation 54Numerical 0 probability for all classes with observation 55Numerical 0 probability for all classes with observation 56Numerical 0 probability for all classes with observation 57Numerical 0 probability for all classes with observation 58Numerical 0 probability for all classes with observation 59Numerical 0 probability for all classes with observation 60Numerical 0 probability for all classes with observation 61Numerical 0 probability for all classes with observation 62Numerical 0 probability for all classes with observation 63Numerical 0 probability for all classes with observation 64Numerical 0 probability for all classes with observation 65Numerical 0 probability for all classes with observation 66Numerical 0 probability for all classes with observation 67Numerical 0 probability for all classes with observation 68Numerical 0 probability for all classes with observation 69Numerical 0 probability for all classes with observation 70Numerical 0 probability for all classes with observation 71Numerical 0 probability for all classes with observation 72Numerical 0 probability for all classes with observation 73Numerical 0 probability for all classes with observation 74Numerical 0 probability for all classes with observation 75Numerical 0 probability for all classes with observation 76Numerical 0 probability for all classes with observation 77Numerical 0 probability for all classes with observation 78
# Model n accuracy
mean(predicted.classes.naive2 == test.pima.indians$diabetes)
[1] 0.8076923
Support Vector Machine (or SVM) is a machine learning technique used for classification tasks. Briefly, SVM works by identifying the optimal decision boundary that separates data points from different groups (or classes), and then predicts the class of new observations based on the separation boundary.
Depending on the situations, the different groups might be separable by a linear straight line or by a non-linear boundary line.
Support vector machine methods can handle both linear and non-linear class boundaries. It can be used for both two-classs and multi-class classification problems.
In real life data, the separation boundary is generally nonlinear. Technically, the SVM algorithm perform a non-linear classification using what is called the kernel trick. The most commonly used kernel transformations ar e_polynomial kernel_ and radial kernel
Note that, there is also an extension of the SVM for regression, called support vector regression.
library(tidyverse)
library(caret)
Data set: PimaIndiansDiabetes2 [in mlbench package]
# Load the data
data("PimaIndiansDiabetes2", package = "mlbench")
pima.data.cleaned <- na.omit(PimaIndiansDiabetes2)
# Split the data into training and test set
set.seed(123)
t.sample <- pima.data.cleaned$diabetes %>% createDataPartition(p = 0.8, list = FALSE)
In the following example variables are normalized to make their scale comparable. This is automatically done before building the SVM classifier by setting the option preProcess = c(“center”, “scale”).
# Fit the model on the training set
set.seed(123)
model.svm <- train(
diabetes~., data = train.pima.indians, method = "svmLinear",
trContro = trainControl("cv", number = 10),
preProcess = c("center", "scale")
)
# Make predictions on the test data
predicted.classes.svm <- model.svm %>% predict(test.pima.indians)
head(predicted.classes.svm)
[1] neg pos neg pos pos neg
Levels: neg pos
# Computed model accuracy rate
mean(predicted.classes.svm == test.pima.indians$diabetes)
[1] 0.7820513
Note that, there is a tuning parameter C, also known as Cost, that determines the possible misclassifications. It essentially imposes a penalty to the model for making an error. The higher the value of C, the less likely it is that the SVM algorithm will misclassify a point.
By default caret builds the SVM linear classifier usin g C= 1. You can check this by typing model in R console
model.svm
Support Vector Machines with Linear Kernel
314 samples
8 predictor
2 classes: 'neg', 'pos'
Pre-processing: centered (8), scaled (8)
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 314, 314, 314, 314, 314, 314, ...
Resampling results:
Accuracy Kappa
0.7862095 0.4893315
Tuning parameter 'C' was held constant at a value of 1
The following R code compute SVM for a grid values of C and choose automatically the final model for predictions:
# Fit the model on the training set
set.seed(123)
model.svm2 <- train(
diabetes ~., data = train.pima.indians, method = "svmLinear",
trControl = trainControl("cv", number = 10),
tuneGrid = expand.grid(C = seq(0.1, 2, length = 20)),
preProcess = c("center", "scale")
)
# Plot model accuracy vs different values of Cost
plot(model.svm2)
# Print the best tuning parameter C that maximizes model accuracy
model.svm2$bestTune
Let’s use the best model
# Make predictions on the test data
predicted.classess.svm2 <- model.svm2 %>% predict(test.pima.indians)
# Compute model accuracy rate
mean(predicted.classess.svm2 == test.pima.indians$diabetes)
[1] 0.7820513
To build a non-linear SVM classifier, we can use either polynomial kernel or radial kernel function.
# Fit the model on the training set
set.seed(123)
model.svm.radial <- train(
diabetes~., data = train.pima.indians, method = "svmRadial",
trControl = trainControl("cv", number = 10),
preProcess = c("center", "scale"),
tuneLength = 10
)
# Print the best tuning parameter sigma and C that maximizes model accuracy
model.svm.radial$bestTune
# Make predictions on the test data
predicted.classess.svm.radial <- model.svm.radial %>% predict(test.pima.indians)
# Compute model Accuracy rate
mean(predicted.classess.svm.radial == test.pima.indians$diabetes)
[1] 0.7948718
# Fit the model on the training set
set.seed(123)
model.svm.polynomial <- train(
diabetes~., data = train.pima.indians, method = "svmPoly",
trControl = trainControl("cv", number = 10),
preProcess = c("center", "scale"),
tuneLength = 4
)
# Print the best tuning parameter sigma and C that maximizes model accuracy
model.svm.polynomial$bestTune
# Make predictions on the test data
predicted.classess.svm.polynomial <- model.svm.polynomial %>% predict(test.pima.indians)
# Compute model Accuracy rate
mean(predicted.classess.svm.polynomial == test.pima.indians$diabetes)
[1] 0.7948718
In our examples, it can be seen that the SVM classifier using non-linear kernel gives a better result compared to the linear model.
After building a predictive classification model, you need to evaluate the performance of the model, that is how good the model is in predicting the outcome of new observations test data that have been not used to train the model.
The common use metrics and methods for assessing the performance of predictive classification models:
Average classification accuracy - representing the proportion of correctly classified observations.
Confusion matrix - which is 2x2 table showing four parameters, including the number of true positives, true negatives, false negatives and false positives.
Precision, Recall and Specificity - which are three major performance metrics describing a predictive classification model
ROC curve - which is a graphical summary of the overall performance of the model, showing the proportion of true positives and false positives at tall possible values of probability cutoff. The Area Under the Curve(AUC) summarizes the overall performance of the classifier.
library(tidyverse)
library(caret)
To keep things simple we’ll perform a binary classification, where the outcome variable can have only two possible values: negative vs positive.
We’ll compute an example of linear discriminant analysis model using the PimaIndiansDiabetes2 [mlbench package]
data("PimaIndiansDiabetes2", package = "mlbench")
pima.data.cleaned <- na.omit(PimaIndiansDiabetes2)
# Split the data into training and test set
set.seed(123)
t.sample <- pima.data.cleaned$diabetes %>% createDataPartition(p = 0.8, list = FALSE)
train.pima2 <- pima.data.cleaned[t.sample, ]
test.pima2 <- pima.data.cleaned[-t.sample, ]
library(MASS)
#Fit LDA
fit.lda <- lda(diabetes ~., data = train.pima2)
# Make predictions on the test data
predictions.pima2 <- predict(fit.lda, test.pima2)
preictions.probabilities2 <- predictions.pima2$posterior[,2]
predicted.classess2 <- predictions.pima2$class
observed.classes2 <- test.pima2$diabetes
The overall classification accuracy rate corresponds to the proportion of observations that have been correctly classified. Determining the raw classification accuracy is the first step in assessing the performance of a model.
accuracy <- mean(observed.classes2 == predicted.classess2)
accuracy
[1] 0.8076923
error <- mean(observed.classes2 != predicted.classess2)
error
[1] 0.1923077
From the output avove, the linear discrimant analysis correctly predicted the individual outcome in 81% of the cases. This is by far better than random guessing. The misclassification error rate can be calculated as 100-81 = 19%
In our example, a binary classifier can make two types of errors:
The proportion of these two types of errors can be determined by creating a confution matrxi, which compare the predicted coutcome values agains the known outcome vaues.
The R function table() can be used to produce a confusion matrix in order to determin how many observations were correctly or incorrectly classified. It compares the observed and the predicted outcome values and shows the number of correct and incorrect predictions categorized by type of outcome.
# Confusion matrix, number of cases
table(observed.classes2, predicted.classess2)
predicted.classess2
observed.classes2 neg pos
neg 48 4
pos 11 15
# Confusion matrix, proportion of cases
table(observed.classes2, predicted.classess2) %>% prop.table() %>% round(digits = 3)
predicted.classess2
observed.classes2 neg pos
neg 0.615 0.051
pos 0.141 0.192
The diagonal elements of the confusion matrix indicate correct predictions, while the off diagonals represent incorrect prediction. So, the correct classification rate is the sum of the number on the diagonal divided by the sample size in the test data. In our example, that is (48+15)/78 = 81%
Technically the raw prediction accuracy of the model is defined as (TruePositives + TrueNegatives)/SampleSize.
In addition to the raw classification accuracy, there are many other metrics that are widely used to examine the performance of a classification model, including:
Precision, which is the proportion of true positives among all the individuals that have been predicted to be diabetes-positive by the model. This represents the accuracy of a predicted positive outcome. Precision = TruePositives/(TruePositives + FalsePOsitives)
Sensitivity (or Recall), which is the True Positive Rate (TPR) or the proportion of identified positives among the diabetes-positive population (class = 1). Sensitivity = TruePOsitives/(TruePositives + FalseNegatives).
Specificity, which measures the True Negative Rate (TNR), that is the proportion of identified negatives among the diabetes-negative population (class = 0). Specificity = TrueNegatives/(TrueNegatives + FalseNegatives).
False POsitive Rate (FPR), which represents the proportion of identified positives among the healthy individuals (i.e. diabetes-negative). This can be seen as a false alarm. The FPR can be also calculated as 1-specificity. When positives are rare, the FPR can be high, leading to the situation where a predicted positive is most likely a negative.
confusionMatrix(observed.classes2, predicted.classess2, positive = "pos")
Confusion Matrix and Statistics
Reference
Prediction neg pos
neg 48 4
pos 11 15
Accuracy : 0.8077
95% CI : (0.7027, 0.8882)
No Information Rate : 0.7564
P-Value [Acc > NIR] : 0.1787
Kappa : 0.5361
Mcnemar's Test P-Value : 0.1213
Sensitivity : 0.7895
Specificity : 0.8136
Pos Pred Value : 0.5769
Neg Pred Value : 0.9231
Prevalence : 0.2436
Detection Rate : 0.1923
Detection Prevalence : 0.3333
Balanced Accuracy : 0.8015
'Positive' Class : pos
In our example, the sensitivity is ~58%, that is the proportion of diabetes-positive individuals that were correctly identified by the model as diabetes-positive. The specificity of the model is ~92%, that is the proportion of diabetes-negative individuals that were correctly identified by the model as diabetes-negative. The model precision or the proportion of positive predicted value is 79%.
The ROC curve (or receiver operating characteristics curve) is apopular graphical measure for assessing the performance or the accuracy of a classifier, which corresponds to the total
The ROC analysis can be easily performed using the R package pROC
library(pROC)
res.roc <- roc(observed.classes2,preictions.probabilities2)
Setting levels: control = neg, case = pos
Setting direction: controls < cases
plot.roc(res.roc, print.auc = TRUE)
# Extract some interesting results
roc.data <- data_frame(
thresholds = res.roc$thresholds,
sensitivity = res.roc$sensitivities,
specificity = res.roc$specificities
)
`data_frame()` is deprecated, use `tibble()`.
[90mThis warning is displayed once per session.[39m
# Get the probability threshold for specificity = 0.6
roc.data %>% filter(specificity >= 0.6)
plot.roc(res.roc, print.auc = TRUE, print.thres = "best")
plot.roc(res.roc, print.thres = c(0.3,0.5,0.7))